#############################################################################################################
## Olivia Chi & Aaron Dow
## GOV2001
## Replication Code
#############################################################################################################

setwd("C:/Users/olc317/Documents/GOV2001")

rm(list = ls())
ls()

library(foreign)
library(MatchIt)
library(cem)
library(Zelig)
library(sandwich)
library(lmtest)
library(stargazer)
library(mvtnorm)
library(Rlab)

########################
# Load data
########################

data <- read.dta("data_for_analysis.dta")

## Clustered Standard Errors
cl   <- function(dat,fm, cluster){
  attach(dat, warn.conflicts = F)
  library(sandwich)
  M <- length(unique(cluster))   
  N <- length(cluster)            
  K <- fm$rank                 
  dfc <- (M/(M-1))*((N-1)/(N-K))  
  uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
  vcovCL <- dfc*sandwich(fm, meat=crossprod(uj)/N)
  coeftest(fm, vcovCL) }

# matched.data for original analysis
sample <- data[data$dist_from_cut >= -0.6 & data$dist_from_cut <=0.6,]
# Create a variable for the next GPA, imputing from nextGPA variable
sample$nextGPA_temp <- sample$nextGPA + sample$gpacutoff
# Create binary indicator for improving GPA in next term
sample$improve_GPA[sample$nextGPA_temp > sample$GPA_year1] <- 1
sample$improve_GPA[sample$nextGPA_temp <= sample$GPA_year1] <-0

# Imbalance between treatment and control in original analysis
plot(density(sample$hsgrade_pct[sample$gpalscutoff==1]),col="red",
     xlim = c(0,100),
     main = "Density of HS Grade Percentile",
     lty=2,
     xlab = "HS Grade Percentile")
lines(density(sample$hsgrade_pct[sample$gpalscutoff==0]),col="blue")
legend(55,.022, bty="n",
       c("Good Standing","Under Probation"),
       lty=c(1,2),
       lwd=c(1,1),col=c("blue","red")) 

###################################
# Exact matching
###################################

# Subset of data with variables needed for analysis
matching.sample <- subset(sample, select =
                            c(gpalscutoff,gpaXgpalscutoff,gpaXgpagrcutoff,
                              clustervar,hsgrade_pct,totcredits_year1,
                              age_at_entry,male,english,bpl_north_america,
                              loc_campus1,loc_campus2,loc_campus3,
                              nextGPA,left_school,improve_GPA,gradin4,gradin5,
                              gradin6, dist_from_cut, probation_year1,
                              probation_ever, suspended_ever,lowHS,highHS,
                              female,noenglish))

# Subset of data with no missingness (need non-missingness to run matchit)
matching.sample2 <- subset(sample, select =
                             c(gpalscutoff,gpaXgpalscutoff,gpaXgpagrcutoff,
                               clustervar,hsgrade_pct,totcredits_year1,
                               age_at_entry,male,english,bpl_north_america,
                               loc_campus1,loc_campus2,loc_campus3))


# Pre-imbalance
pre.imbalance <- imbalance(group = matching.sample$gpalscutoff,
                           data=matching.sample, drop=c("gpalscutoff","gpaXgpalscutoff",
                                                        "gpaXgpagrcutoff","clustervar",
                                                        "nextGPA","left_school",
                                                        "improve_GPA","gradin4",
                                                        "gradin5","gradin6", "dist_from_cut",
                                                        "probation_year1", "probation_ever",
                                                        "suspended_ever","lowHS","highHS",
                                                        "female","noenglish"))
pre.imbalance

# Perform exact matching
exact.match <- matchit(formula = gpalscutoff ~ hsgrade_pct + totcredits_year1 +
                         age_at_entry + male + english + bpl_north_america +
                         loc_campus1 + loc_campus2 +loc_campus3,
                       data = matching.sample2, discard="both", method = "exact", 
                       distance = 0)
exact.match

# Get pruned data
data.matched <- match.data(exact.match)

# Rows in matched data
matched.data.rows <- as.vector(rownames(data.matched))

# Rows in the whole sample (all students within bandwidth)
all.rows <- as.vector(rownames(matching.sample2))

# Rows in unmatched data
unmatched.rows<-setdiff(all.rows,matched.data.rows)

# Get unmatched data
unmatched.data <-matching.sample[unmatched.rows,]

# Imbalance of unmatched
unmatched.imbalance <- imbalance(group = unmatched.data$gpalscutoff,
                                 data=unmatched.data, drop=c("gpalscutoff","gpaXgpalscutoff",
                                                             "gpaXgpagrcutoff","clustervar",
                                                             "nextGPA","left_school",
                                                             "improve_GPA","gradin4",
                                                             "gradin5","gradin6", "dist_from_cut",
                                                             "probation_year1", "probation_ever",
                                                             "suspended_ever","lowHS","highHS",
                                                             "female","noenglish"))
unmatched.imbalance

# Get pruned data set from matching.sample with all necessary variables
matched.data <- matching.sample[matched.data.rows,]

# Bind weights to the dataset
matched.data$weights <- data.matched$weights

# Post imbalance
post.imbalance <- imbalance(group = matched.data$gpalscutoff,
                            data=matched.data, weights = data.matched$weights,
                            drop=c("gpalscutoff","gpaXgpalscutoff",
                                   "gpaXgpagrcutoff","clustervar",
                                   "nextGPA","left_school",
                                   "improve_GPA","gradin4",
                                   "gradin5","gradin6", 
                                   "dist_from_cut", "probation_year1",
                                   "probation_ever", "suspended_ever",
                                   "weights","lowHS","highHS","female",
                                   "noenglish"))
post.imbalance             

test1 <- lm(left_school ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
            data = matched.data, weights = weights)
cl(matched.data, test1, matched.data$clustervar)


# Imbalance between treatment and control in our matched data
plot(density(matched.data$hsgrade_pct[matched.data$gpalscutoff==1],
      weights=matched.data$weights[matched.data$gpalscutoff==1]/
      sum(matched.data$weights[matched.data$gpalscutoff==1])),
     col="red",
     xlim = c(0,100),
     main = "Density of HS Grade Percentile",
     lty=2,
     xlab = "HS Grade Percentile")
lines(density(matched.data$hsgrade_pct[matched.data$gpalscutoff==0],
        weights=matched.data$weights[matched.data$gpalscutoff==0]/
        sum(matched.data$weights[matched.data$gpalscutoff==0]))
        ,col="blue")
legend(55,.022, bty="n",
       c("Good Standing","Under Probation"),
       lty=c(1,2),
       lwd=c(1,1),col=c("blue","red")) 


################################################################
# What is going on around the cutoff?
# Probation year 1
# Plots around the cutoff: Linear, Quadratic, and Cubic
################################################################

dist_from_cut_range <- seq(-0.6,.6,0.01)

plot(hist(matched.data$dist_from_cut,breaks=50))

probation_matrix <- matrix(data=NA,nrow=121,ncol=3)

for(j in 1:length(dist_from_cut_range)){
  probation_matrix[j,1] <- dist_from_cut_range[j]
  probation_matrix[j,2] <-
    mean(matched.data$probation_year1
         [matched.data$dist_from_cut>=dist_from_cut_range[j] & 
            matched.data$dist_from_cut<dist_from_cut_range[j+1]])
  probation_matrix[j,3] <- dist_from_cut_range[j]*
    dist_from_cut_range[j]
  
}


## SET UP TO PLOT LINEAR, QUADRATIC, AND CUBICS
matched.data$gpaXlscutoff <- matched.data$gpaXgpalscutoff[matched.data
                                                          $gpaXgpalscutoff<=0]
matched.data$gpaXlscutoff[matched.data$gpaXlscutoff==0] <-NA
matched.data$gpaXlscutoffsq <- matched.data$gpaXlscutoff^2
matched.data$gpaXlscutoffcube <- matched.data$gpaXlscutoff^3

matched.data$gpagrcutoff <- matched.data$gpaXgpagrcutoff[matched.data
                                                         $gpaXgpagrcutoff>=0]
matched.data$gpagrcutoff[matched.data$gpagrcutoff==0] <-NA
matched.data$gpagrcutoffsq <- matched.data$gpagrcutoff^2
matched.data$gpagrcutoffcube <- matched.data$gpagrcutoff^3

# Local Linear

reglinet <- lm(probation_year1~gpaXlscutoff,data=matched.data,weights=weights)
reglinec <- lm(probation_year1~gpagrcutoff,data=matched.data,weights=weights)
xseqt  <-  seq(-.6, 0, len=250)
xseqc <- seq(0,.6,len=250)

plot(probation_matrix[,1],probation_matrix[,2],
     ylab = "Probation Status",
     xlab = "First year GPA minus probation cutoff")
abline(v=0,lty=2)
lines(xseqt, reglinet$coef %*% rbind(1, xseqt), lwd=2, col="red")
lines(xseqc, reglinec$coef %*% rbind(1, xseqc), lwd=2, col="blue")



################################################################
# What is going on around the cutoff?
# Left School
# Plots around the cutoff: Linear, Quadratic, and Cubic
################################################################

dist_from_cut_range <- seq(-0.6,.6,0.01)

plot(hist(matched.data$dist_from_cut,breaks=50))

left_school_matrix <- matrix(data=NA,nrow=121,ncol=3)

for(j in 1:length(dist_from_cut_range)){
  left_school_matrix[j,1] <- dist_from_cut_range[j]
  left_school_matrix[j,2] <-
    mean(matched.data$left_school
         [matched.data$dist_from_cut>=dist_from_cut_range[j] & 
            matched.data$dist_from_cut<dist_from_cut_range[j+1]])
  left_school_matrix[j,3] <- dist_from_cut_range[j]*
    dist_from_cut_range[j]
  
}

left_school_matrix_t <- left_school_matrix[1:60,1:2]
left_school_matrix_c <- left_school_matrix[61:121,1:2]


## SET UP TO PLOT LINEAR, QUADRATIC, AND CUBICS
matched.data$gpaXlscutoff <- matched.data$gpaXgpalscutoff[matched.data
                                                         $gpaXgpalscutoff<=0]
matched.data$gpaXlscutoff[matched.data$gpaXlscutoff==0] <-NA
matched.data$gpaXlscutoffsq <- matched.data$gpaXlscutoff^2
matched.data$gpaXlscutoffcube <- matched.data$gpaXlscutoff^3

matched.data$gpagrcutoff <- matched.data$gpaXgpagrcutoff[matched.data
                                                         $gpaXgpagrcutoff>=0]
matched.data$gpagrcutoff[matched.data$gpagrcutoff==0] <-NA
matched.data$gpagrcutoffsq <- matched.data$gpagrcutoff^2
matched.data$gpagrcutoffcube <- matched.data$gpagrcutoff^3

# Local Linear

reglinet <- lm(left_school~gpaXlscutoff,data=matched.data,weights=weights)
reglinec <- lm(left_school~gpagrcutoff,data=matched.data,weights=weights)
xseqt  <-  seq(-.6, 0, len=250)
xseqc <- seq(0,.6,len=250)

plot(left_school_matrix[,1],left_school_matrix[,2],
     ylab = "Left University Voluntarily",
     xlab = "First year GPA minus probation cutoff")
abline(v=0,lty=2)
lines(xseqt, reglinet$coef %*% rbind(1, xseqt), lwd=2, col="red")
lines(xseqc, reglinec$coef %*% rbind(1, xseqc), lwd=2, col="blue")



# Quadratic
reglinet <- lm(left_school~gpaXlscutoff+gpaXlscutoffsq,data=matched.data,weights=weights)
reglinec <- lm(left_school~gpagrcutoff+gpagrcutoffsq,data=matched.data,weights=weights)
xseqt  <-  seq(-.6, 0, len=250)
xseqc <- seq(0,.6,len=250)

plot(matched.data$dist_from_cut,matched.data$left_school,
     ylab = "Left University Voluntarily",
     xlab = "First year GPA minus probation cutoff",
     ylim = c(0,.1),
     sub = "Quadratic terms")
lines(xseqt, reglinet$coef %*% rbind(1, xseqt,(xseqt)^2), lwd=2, col="red")
lines(xseqc, reglinec$coef %*% rbind(1, xseqc,(xseqc)^2), lwd=2, col="blue")
abline(v=0,lty=2)


plot(left_school_matrix[,1],left_school_matrix[,2],
     ylab = "Left University Voluntarily",
     xlab = "First year GPA minus probation cutoff")
lines(xseqt, reglinet$coef %*% rbind(1, xseqt,(xseqt)^2), lwd=2, col="red")
lines(xseqc, reglinec$coef %*% rbind(1, xseqc,(xseqc)^2), lwd=2, col="blue")
abline(v=0,lty=2)


# Cubed
reglinet <- lm(left_school~gpaXlscutoff+gpaXlscutoffsq+gpaXlscutoffcube,data=matched.data,weights=weights)
reglinec <- lm(left_school~gpagrcutoff+gpagrcutoffsq+gpagrcutoffcube,data=matched.data,weights=weights)
xseqt  <-  seq(-.6, 0, len=250)
xseqc <- seq(0,.6,len=250)


plot(matched.data$dist_from_cut,matched.data$left_school,
     ylab = "Left University Voluntarily",
     xlab = "First year GPA minus probation cutoff",
     ylim = c(0,.1),
     sub = "Cubic terms")
lines(xseqt, reglinet$coef %*% rbind(1, xseqt,(xseqt)^2,xseqt^3), lwd=2, col="red")
lines(xseqc, reglinec$coef %*% rbind(1, xseqc,(xseqc)^2,xseqc^3), lwd=2, col="blue")
abline(v=0,lty=2)

plot(left_school_matrix[,1],left_school_matrix[,2],
     ylab = "Left University Voluntarily",
     xlab = "First year GPA minus probation cutoff")
lines(xseqt, reglinet$coef %*% rbind(1, xseqt,(xseqt)^2,xseqt^3), lwd=2, col="red")
lines(xseqc, reglinec$coef %*% rbind(1, xseqc,(xseqc)^2,xseqc^3), lwd=2, col="blue")
abline(v=0,lty=2)

# Linear plot, larger bins

dist_from_cut_range <- seq(-0.6,.6,0.05)

left_school_matrix <- matrix(data=NA,nrow=25,ncol=3)

for(j in 1:length(dist_from_cut_range)){
  left_school_matrix[j,1] <- dist_from_cut_range[j]
  left_school_matrix[j,2] <-
    mean(matched.data$left_school
         [matched.data$dist_from_cut>=dist_from_cut_range[j] & 
            matched.data$dist_from_cut<dist_from_cut_range[j+1]])
  left_school_matrix[j,3] <- dist_from_cut_range[j]*
    dist_from_cut_range[j]
  
}

left_school_matrix_t <- left_school_matrix[1:12,1:2]
left_school_matrix_c <- left_school_matrix[13:25,1:2]


reglinet <- lm(left_school~gpaXlscutoff,data=matched.data,weights=weights)
reglinec <- lm(left_school~gpagrcutoff,data=matched.data,weights=weights)
xseqt  <-  seq(-.6, 0, len=250)
xseqc <- seq(0,.6,len=250)

plot(left_school_matrix[,1],left_school_matrix[,2],
     ylab = "Left University Voluntarily",
     xlab = "First year GPA minus probation cutoff",
     ylim = c(0,.15))
abline(v=0,lty=2)
lines(xseqt, reglinet$coef %*% rbind(1, xseqt), lwd=2, col="red")
lines(xseqc, reglinec$coef %*% rbind(1, xseqc), lwd=2, col="blue")


################################################################
# What is going on around the cutoff?
# Subsequent GPA
# Plots around the cutoff: Linear, Quadratic, and Cubic
################################################################

dist_from_cut_range <- seq(-0.6,.6,0.01)

next_gpa_matrix <- matrix(data=NA,nrow=121,ncol=3)


for(j in 1:length(dist_from_cut_range)){
  next_gpa_matrix[j,1] <- dist_from_cut_range[j]
  next_gpa_matrix[j,2] <-
    mean(matched.data$nextGPA
         [matched.data$dist_from_cut>=dist_from_cut_range[j] & 
            matched.data$dist_from_cut<dist_from_cut_range[j+1] &
           is.na(matched.data$nextGPA)==0])
  next_gpa_matrix[j,3] <- dist_from_cut_range[j]*
    dist_from_cut_range[j]
  
}

next_gpa_matrix_t <- next_gpa_matrix[1:60,1:2]
next_gpa_matrix_c <- next_gpa_matrix[61:121,1:2]


## SET UP TO PLOT LINEAR, QUADRATIC, AND CUBICS
matched.data$gpaXlscutoff <- matched.data$gpaXgpalscutoff[matched.data
                                                          $gpaXgpalscutoff<=0]
matched.data$gpaXlscutoff[matched.data$gpaXlscutoff==0] <-NA
matched.data$gpaXlscutoffsq <- matched.data$gpaXlscutoff^2
matched.data$gpaXlscutoffcube <- matched.data$gpaXlscutoff^3

matched.data$gpagrcutoff <- matched.data$gpaXgpagrcutoff[matched.data
                                                         $gpaXgpagrcutoff>=0]
matched.data$gpagrcutoff[matched.data$gpagrcutoff==0] <-NA
matched.data$gpagrcutoffsq <- matched.data$gpagrcutoff^2
matched.data$gpagrcutoffcube <- matched.data$gpagrcutoff^3

# Local Linear

reglinet <- lm(nextGPA~gpaXlscutoff,data=matched.data,weights=weights)
reglinec <- lm(nextGPA~gpagrcutoff,data=matched.data,weights=weights)
xseqt  <-  seq(-.6, 0, len=250)
xseqc <- seq(0,.6,len=250)

plot(next_gpa_matrix[,1],next_gpa_matrix[,2],
     ylab = "Subsequent GPA minus cutoff",
     xlab = "First year GPA minus probation cutoff")
abline(v=0,lty=2)
lines(xseqt, reglinet$coef %*% rbind(1, xseqt), lwd=2, col="red")
lines(xseqc, reglinec$coef %*% rbind(1, xseqc), lwd=2, col="blue")


################################################################
# What is going on around the cutoff?
# Graduation in 4, 5, 6
# Plots around the cutoff: Linear, Quadratic, and Cubic
################################################################

dist_from_cut_range <- seq(-0.6,.6,0.05)

# Grad in 4
gradin4_matrix <- matrix(data=NA,nrow=25,ncol=3)

for(j in 1:length(dist_from_cut_range)){
  gradin4_matrix[j,1] <- dist_from_cut_range[j]
  gradin4_matrix[j,2] <-
    mean(matched.data$gradin4
         [matched.data$dist_from_cut>=dist_from_cut_range[j] & 
            matched.data$dist_from_cut<dist_from_cut_range[j+1] &
            is.na(matched.data$gradin4)==0])
  gradin4_matrix[j,3] <- dist_from_cut_range[j]*
    dist_from_cut_range[j]
  
}


# Grad in 5
gradin5_matrix <- matrix(data=NA,nrow=25,ncol=3)

for(j in 1:length(dist_from_cut_range)){
  gradin5_matrix[j,1] <- dist_from_cut_range[j]
  gradin5_matrix[j,2] <-
    mean(matched.data$gradin5
         [matched.data$dist_from_cut>=dist_from_cut_range[j] & 
            matched.data$dist_from_cut<dist_from_cut_range[j+1] &
            is.na(matched.data$gradin5)==0])
  gradin5_matrix[j,3] <- dist_from_cut_range[j]*
    dist_from_cut_range[j]
  
}

# Grad in 6
gradin6_matrix <- matrix(data=NA,nrow=25,ncol=3)

for(j in 1:length(dist_from_cut_range)){
  gradin6_matrix[j,1] <- dist_from_cut_range[j]
  gradin6_matrix[j,2] <-
    mean(matched.data$gradin6
         [matched.data$dist_from_cut>=dist_from_cut_range[j] & 
            matched.data$dist_from_cut<dist_from_cut_range[j+1] &
            is.na(matched.data$gradin6)==0])
  gradin6_matrix[j,3] <- dist_from_cut_range[j]*
    dist_from_cut_range[j]
  
}

## SET UP TO PLOT LINEAR, QUADRATIC, AND CUBICS
matched.data$gpaXlscutoff <- matched.data$gpaXgpalscutoff[matched.data
                                                          $gpaXgpalscutoff<=0]
matched.data$gpaXlscutoff[matched.data$gpaXlscutoff==0] <-NA
matched.data$gpaXlscutoffsq <- matched.data$gpaXlscutoff^2
matched.data$gpaXlscutoffcube <- matched.data$gpaXlscutoff^3

matched.data$gpagrcutoff <- matched.data$gpaXgpagrcutoff[matched.data
                                                         $gpaXgpagrcutoff>=0]
matched.data$gpagrcutoff[matched.data$gpagrcutoff==0] <-NA
matched.data$gpagrcutoffsq <- matched.data$gpagrcutoff^2
matched.data$gpagrcutoffcube <- matched.data$gpagrcutoff^3

# Local Linear

reglinet6 <- lm(gradin6~gpaXlscutoff,data=matched.data,weights=weights)
reglinec6 <- lm(gradin6~gpagrcutoff,data=matched.data,weights=weights)


reglinet5 <- lm(gradin5~gpaXlscutoff,data=matched.data,weights=weights)
reglinec5 <- lm(gradin5~gpagrcutoff,data=matched.data,weights=weights)


reglinet4 <- lm(gradin4~gpaXlscutoff,data=matched.data,weights=weights)
reglinec4 <- lm(gradin4~gpagrcutoff,data=matched.data,weights=weights)

xseqt  <-  seq(-.6, 0, len=250)
xseqc <- seq(0,.6,len=250)


grad.graph <- as.data.frame(rbind(gradin4_matrix,
                                  gradin5_matrix,
                                  gradin6_matrix))
grad_years <- c(rep("Grad in 4",25),rep("Grad in 5",25),
                rep("Grad in 6",25))

grad.graph <- cbind(grad.graph,grad_years)

# Plot
par(xpd=FALSE)
plot(gradin6_matrix[,1],gradin6_matrix[,2],
     ylab = "Has Graduated",
     xlab = "First year GPA minus probation cutoff",
     col= "green",
     ylim = c(0,1))
points(gradin5_matrix[,1],gradin5_matrix[,2],pch=22,col="darkblue")
points(gradin4_matrix[,1],gradin4_matrix[,2],pch=2,col="darkred")
abline(v=0,lty=2)
lines(xseqt, reglinet6$coef %*% rbind(1, xseqt), lwd=2, col="green")
lines(xseqc, reglinec6$coef %*% rbind(1, xseqc), lwd=2, col="green")
lines(xseqt, reglinet5$coef %*% rbind(1, xseqt), lwd=2, col="darkblue")
lines(xseqc, reglinec5$coef %*% rbind(1, xseqc), lwd=2, col="darkblue")
lines(xseqt, reglinet4$coef %*% rbind(1, xseqt), lwd=2, col="darkred")
lines(xseqc, reglinec4$coef %*% rbind(1, xseqc), lwd=2, col="darkred")
par(xpd=TRUE)
legend("top", inset=-.3, ncol=3,
       c("Within \n   6 \n years", "Within \n   5 \n years",
                 "Within \n   4 \n years"), pch=c(1,22,20),
       bty="n", lwd=2,
       cex=.6, col=c("green", "darkblue", "darkred"))

colnames(matched.data)
matched.data <- matched.data[,1:28]

###################################
# Table 1 Summary Statistics
###################################

# Function to display mean and SD
meanstdv <- function(vector) {
  m <- mean(vector, na.rm=TRUE)
  s <- sd(vector, na.rm=TRUE)
  return(c(m,s))
}

# Output displays results in Table 1

# Characteristics
meanstdv(matched.data$hsgrade_pct)
meanstdv(matched.data$totcredits_year1)
meanstdv(matched.data$age_at_entry)
meanstdv(matched.data$male)
meanstdv(matched.data$english)
meanstdv(matched.data$bpl_north_america)
meanstdv(matched.data$loc_campus1)
meanstdv(matched.data$loc_campus2)
meanstdv(matched.data$loc_campus3)

# Outcomes
meanstdv(matched.data$dist_from_cut)
meanstdv(matched.data$probation_year1)
meanstdv(matched.data$probation_ever)
meanstdv(matched.data$left_school)
meanstdv(matched.data$nextGPA)
meanstdv(matched.data$suspended_ever)
meanstdv(matched.data$gradin4)
meanstdv(matched.data$gradin5)
meanstdv(matched.data$gradin6)

# Subset of data for summary statistics
summary_stat_data <- cbind(matched.data$hsgrade_pct,matched.data$totcredits_year1, matched.data$age_at_entry, matched.data$male, 
                           matched.data$english, matched.data$bpl_north_america, matched.data$loc_campus1, matched.data$loc_campus2,
                           matched.data$loc_campus3, matched.data$dist_from_cut, matched.data$probation_year1, matched.data$probation_ever, 
                           matched.data$left_school, matched.data$nextGPA, matched.data$suspended_ever, matched.data$gradin4, matched.data$gradin5,
                           matched.data$gradin6)

summary_stat_data <- as.data.frame(summary_stat_data)

variable.names<-c("High school grade percentile", "Credits attempted in first year", "Age at entry" , "Male", "English is first language",
                  "Born in North America", "At Campus 1", "At Campus 2", "At Campus 3", "Distance from cutoff in 1st year", 
                  "On probation after 1st year", "Ever on academic probation", "Left university after 1st evaluation", 
                  "Distance from cutoff at next evaluation", "Ever suspended", "Graduated by year 4", 
                  "Graduated by year 5", "Graduated by year 6")

colnames(summary_stat_data) <- variable.names

stargazer(summary_stat_data, digits=2, title="SUMMARY STATISTICS", nobs=F, min.max=F)


test1 <- lm(left_school ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
            data = matched.data, weights = weights)
cl(matched.data, test1, matched.data$clustervar)


#####################################################################
# Table 2 - Estimated Discontinuities in Observable Characteristics
#####################################################################

# Set up regression models
col1 <- lm(hsgrade_pct ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
           data = matched.data, weights=weights)
col2 <- lm(totcredits_year1 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
           data = matched.data, weights=weights)
col3 <- lm(age_at_entry ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
           data = matched.data, weights=weights)
col4 <- lm(male ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
           data = matched.data, weights=weights)
col5 <- lm(bpl_north_america ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
           data = matched.data, weights=weights)
col6 <- lm(english ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
           data = matched.data, weights=weights)
col7 <- lm(loc_campus1 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
           data = matched.data, weights=weights)
col8 <- lm(loc_campus2 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
           data = matched.data, weights=weights)
col9 <- lm(loc_campus3 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
           data = matched.data, weights=weights)

# Output displays results in Table 2

cl1 <- cl(matched.data, col1, matched.data$clustervar)
cl2 <- cl(matched.data, col2, matched.data$clustervar)
cl3 <- cl(matched.data, col3, matched.data$clustervar)
cl4 <- cl(matched.data, col4, matched.data$clustervar)
cl5 <- cl(matched.data, col5, matched.data$clustervar)
cl6 <- cl(matched.data, col6, matched.data$clustervar)
cl7 <- cl(matched.data, col7, matched.data$clustervar)
cl8 <- cl(matched.data, col8, matched.data$clustervar)
cl9 <- cl(matched.data, col9, matched.data$clustervar)

se <- list(cl1[,2],cl2[,2],cl3[,2],cl4[,2],cl5[,2],cl6[,2],cl7[,2],cl8[,2],cl9[,2])
p <- list(cl1[,4],cl2[,4],cl3[,4],cl4[,4],cl5[,4],cl6[,4],cl7[,4],cl8[,4],cl9[,4])
stargazer(col1,col2,col3,col4,col5,col6,col7,col8,col9, se=se, p=p)


###############################################################################
# Table 3 - Estimated Discontinuity in Probation Status
# Panel A: Dependent variable: On academic probation after first evaluation
###############################################################################

# Create subgroups
lowHS <- matched.data[matched.data$lowHS == 1, ]
highHS <- matched.data[matched.data$highHS == 1, ]
male <- matched.data[matched.data$male == 1, ]
female <- matched.data[matched.data$female == 1, ]
english <- matched.data[matched.data$english == 1, ]
noenglish <- matched.data[matched.data$noenglish == 1, ]

# Set up regression models
table3col1 <- lm(probation_year1 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = matched.data, weights=weights)

table3col2 <- lm(probation_year1 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = lowHS, weights=weights)

table3col3 <- lm(probation_year1 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = highHS, weights=weights)

table3col4 <- lm(probation_year1 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = male, weights=weights)

table3col5 <- lm(probation_year1 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = female, weights=weights)

table3col6 <- lm(probation_year1 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = english, weights=weights)

table3col7 <- lm(probation_year1 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = noenglish, weights=weights)


# Output displays results in Table 3 Panel A
cl31 <- cl(matched.data, table3col1, matched.data$clustervar)
cl32 <- cl(lowHS, table3col2, lowHS$clustervar)
cl33 <- cl(lowHS, table3col3, highHS$clustervar)
cl34 <- cl(male, table3col4, male$clustervar)
cl35 <- cl(female, table3col5, female$clustervar)
cl36 <- cl(english, table3col6, english$clustervar)
cl37 <- cl(noenglish, table3col7, noenglish$clustervar)


se <- list(cl31[,2],cl32[,2],cl33[,2],cl34[,2],cl35[,2],cl36[,2],cl37[,2])
p <- list(cl31[,4],cl32[,4],cl33[,4],cl34[,4],cl35[,4],cl36[,4],cl37[,4])
stargazer(table3col1,table3col2,table3col3,table3col4,table3col5,table3col6,table3col7, se=se, p=p)

###############################################################################
# Table 3 - Estimated Discontinuity in Probation Status
# Panel B: Dependent variable: Ever on academic probation
###############################################################################

# Set up regression models
table3col1b <- lm(probation_ever ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = matched.data, weights=weights)

table3col2b <- lm(probation_ever ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = lowHS, weights=weights)

table3col3b <- lm(probation_ever ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = highHS, weights=weights)

table3col4b <- lm(probation_ever ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = male, weights=weights)

table3col5b <- lm(probation_ever ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = female, weights=weights)

table3col6b <- lm(probation_ever ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = english, weights=weights)

table3col7b <- lm(probation_ever ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = noenglish, weights=weights)

# Output displays results in Table 3 Panel B
cl31b <- cl(matched.data, table3col1b, matched.data$clustervar)
cl32b <- cl(lowHS, table3col2b, lowHS$clustervar)
cl33b <- cl(lowHS, table3col3b, highHS$clustervar)
cl34b <- cl(male, table3col4b, male$clustervar)
cl35b <- cl(female, table3col5b, female$clustervar)
cl36b <- cl(english, table3col6b, english$clustervar)
cl37b <- cl(noenglish, table3col7b, noenglish$clustervar)

se <- list(cl31b[,2],cl32b[,2],cl33b[,2],cl34b[,2],cl35b[,2],cl36b[,2],cl37b[,2])
p <- list(cl31b[,4],cl32b[,4],cl33b[,4],cl34b[,4],cl35b[,4],cl36b[,4],cl37b[,4])
stargazer(table3col1b,table3col2b,table3col3b,table3col4b,table3col5b,table3col6b,table3col7b, se=se, p=p)


###############################################################################
# Table 4 - Estimated Effect on the Decision to Leave After the First Evaluation
###############################################################################

# Set up regression models
table4col1 <- lm(left_school ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = matched.data, weights=weights)

table4col2 <- lm(left_school ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = lowHS, weights=weights)

table4col3 <- lm(left_school ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = highHS, weights=weights)


table4col4 <-lm(left_school ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                data = male, weights=weights)


table4col5 <-lm(left_school ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                data = female, weights=weights)


table4col6 <-lm(left_school ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                data = english, weights=weights)


table4col7 <-lm(left_school ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                data = noenglish, weights=weights)

# Output displays results in Table 4
cl41 <- cl(matched.data, table4col1, matched.data$clustervar)
cl42 <- cl(lowHS, table4col2, lowHS$clustervar )
cl43 <- cl(highHS, table4col3, highHS$clustervar)
cl44 <- cl(male, table4col4, male$clustervar)
cl45 <- cl(female, table4col5, female$clustervar)
cl46 <- cl(english, table4col6, english$clustervar)
cl47 <- cl(noenglish, table4col7, noenglish$clustervar)


se <- list(cl41[,2],cl42[,2],cl43[,2],cl44[,2],cl45[,2],cl46[,2],cl47[,2])
p <- list(cl41[,4],cl42[,4],cl43[,4],cl44[,4],cl45[,4],cl46[,4],cl47[,4])
stargazer(table4col1,table4col2,table4col3,table4col4,table4col5,table4col6,table4col7, se=se, p=p)



###############################################################################
# Table 5 - Estimated Discontinuities in Subsequent GPA
# Panel A: Dependent variable: Next term GPA
###############################################################################

# Set up the subsets for each column & set up regression models
# All (1)
matched.data2 <- subset(matched.data, select = c(nextGPA, gpalscutoff,
                                                 gpaXgpalscutoff, gpaXgpagrcutoff,
                                                 clustervar, weights))
matched.data2 <- na.omit(matched.data2)

table5col1 <- lm(nextGPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = matched.data2, weights=weights)


# HS grades < median
lowHS2 <- subset(lowHS, select = c(nextGPA, gpalscutoff,
                                   gpaXgpalscutoff, gpaXgpagrcutoff,
                                   clustervar, weights))
lowHS2 <- na.omit(lowHS2)
table5col2 <- lm(nextGPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = lowHS2, weights=weights)


# HS grades > median
highHS2 <- subset(highHS, select = c(nextGPA, gpalscutoff,
                                     gpaXgpalscutoff, gpaXgpagrcutoff,
                                     clustervar, weights))
highHS2 <- na.omit(highHS2)
table5col3 <- lm(nextGPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = highHS2, weights=weights)

# Male
male2 <- subset(male, select = c(nextGPA, gpalscutoff,
                                 gpaXgpalscutoff, gpaXgpagrcutoff,
                                 clustervar, weights))
male2 <- na.omit(male2)
table5col4 <- lm(nextGPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = male2, weights=weights)


# Female
female2 <- subset(female, select = c(nextGPA, gpalscutoff,
                                     gpaXgpalscutoff, gpaXgpagrcutoff,
                                     clustervar, weights))
female2 <- na.omit(female2)
table5col5 <- lm(nextGPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = female2, weights=weights)


# Native English
english2 <- subset(english, select = c(nextGPA, gpalscutoff,
                                       gpaXgpalscutoff, gpaXgpagrcutoff,
                                       clustervar, weights))
english2 <- na.omit(english2)
table5col6 <- lm(nextGPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = english2, weights=weights)


# Nonnative English
noenglish2 <- subset(noenglish, select = c(nextGPA, gpalscutoff,
                                           gpaXgpalscutoff, gpaXgpagrcutoff,
                                           clustervar, weights))
noenglish2 <- na.omit(noenglish2)
table5col7 <- lm(nextGPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = noenglish2, weights=weights)


# Output displays results in Table 5 Panel A
cl51 <- cl(matched.data2, table5col1, matched.data2$clustervar)
cl52 <- cl(lowHS2, table5col2, lowHS2$clustervar)
cl53 <- cl(highHS2, table5col3, highHS2$clustervar)
cl54 <- cl(male2, table5col4, male2$clustervar)
cl55 <- cl(female2, table5col5, female2$clustervar)
cl56 <- cl(english2, table5col6, english2$clustervar)
cl57 <- cl(noenglish2, table5col7, noenglish2$clustervar)


se <- list(cl51[,2],cl52[,2],cl53[,2],cl54[,2],cl55[,2],cl56[,2],cl57[,2])
p <- list(cl51[,4],cl52[,4],cl53[,4],cl54[,4],cl55[,4],cl56[,4],cl57[,4])
stargazer(table5col1,table5col2,table5col3,table5col4,table5col5,table5col6,table5col7, se=se, p=p)


###############################################################################
# Table 5 - Estimated Discontinuities in Subsequent GPA
# Panel B: Dependent variable: Probability of improving GPA in next term
###############################################################################

# The "nextGPA" variable seems to have been transformed from an actual GPA
# to the next GPA's distance from the cutoff.  Reverse the operation to get 
# the next GPA, and then create binary indicator on whether or not it was 
# greater than GPA_year1.

# Due to loss of accuracy from using "nextGPA" variable, which seems to have been
# rounded, results do not perfectly match results in article.

# Set up the subsets for each column & set up regression models
matched.data2b <- subset(matched.data, select = c(improve_GPA, gpalscutoff,
                                                  gpaXgpalscutoff, gpaXgpagrcutoff,
                                                  clustervar, weights))
matched.data2b <- na.omit(matched.data2b)

table5col1b <- lm(improve_GPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = matched.data2b, weights=weights)

# HS grades < median
lowHS2b <- subset(lowHS, select = c(improve_GPA, gpalscutoff,
                                    gpaXgpalscutoff, gpaXgpagrcutoff,
                                    clustervar, weights))
lowHS2b <- na.omit(lowHS2b)
table5col2b <- lm(improve_GPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = lowHS2b, weights=weights)


# HS grades > median
highHS2b <- subset(highHS, select = c(improve_GPA, gpalscutoff,
                                      gpaXgpalscutoff, gpaXgpagrcutoff,
                                      clustervar, weights))
highHS2b <- na.omit(highHS2b)
table5col3b <- lm(improve_GPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = highHS2b, weights=weights)

# Male
male2b <- subset(male, select = c(improve_GPA, gpalscutoff,
                                  gpaXgpalscutoff, gpaXgpagrcutoff,
                                  clustervar, weights))
male2b <- na.omit(male2b)
table5col4b <- lm(improve_GPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = male2b, weights=weights)


# Female
female2b <- subset(female, select = c(improve_GPA, gpalscutoff,
                                      gpaXgpalscutoff, gpaXgpagrcutoff,
                                      clustervar, weights))
female2b <- na.omit(female2b)
table5col5b <- lm(improve_GPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = female2b, weights=weights)


# Native English
english2b <- subset(english, select = c(improve_GPA, gpalscutoff,
                                        gpaXgpalscutoff, gpaXgpagrcutoff,
                                        clustervar, weights))
english2b <- na.omit(english2b)
table5col6b <- lm(improve_GPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = english2b, weights=weights)


# Nonnative English
noenglish2b <- subset(noenglish, select = c(improve_GPA, gpalscutoff,
                                            gpaXgpalscutoff, gpaXgpagrcutoff,
                                            clustervar, weights))
noenglish2b <- na.omit(noenglish2b)
table5col7b <- lm(improve_GPA ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = noenglish2b, weights=weights)


# Output displays results in Table 5 Panel B
cl51b <- cl(matched.data2b, table5col1b, matched.data2b$clustervar)
cl52b <- cl(lowHS2b, table5col2b, lowHS2b$clustervar)
cl53b <- cl(highHS2b, table5col3b, highHS2b$clustervar)
cl54b <- cl(male2b, table5col4b, male2b$clustervar)
cl55b <- cl(female2b, table5col5b, female2b$clustervar)
cl56b <- cl(english2b, table5col6b, english2b$clustervar)
cl57b <- cl(noenglish2b, table5col7b, noenglish2b$clustervar)


se <- list(cl51b[,2],cl52b[,2],cl53b[,2],cl54b[,2],cl55b[,2],cl56b[,2],cl57b[,2])
p <- list(cl51b[,4],cl52b[,4],cl53b[,4],cl54b[,4],cl55b[,4],cl56b[,4],cl57b[,4])
stargazer(table5col1b,table5col2b,table5col3b,table5col4b,table5col5b,table5col6b,table5col7b, se=se, p=p)


###############################################################################
# Table 6 - Estimated Effects on Graduation
# Panel A: Dependent variable: graduated after four years
###############################################################################

# Set up the subsets for each column & set up regression models

# All
matched.data6a <- subset(matched.data, select = c(gradin4, gpalscutoff,
                                                  gpaXgpalscutoff, gpaXgpagrcutoff,
                                                  clustervar, weights))
matched.data6a <- na.omit(matched.data6a)
table6col1 <- lm(gradin4 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = matched.data6a, weights=weights)


# HS grades < median
lowHS6a <- subset(lowHS, select = c(gradin4, gpalscutoff,
                                    gpaXgpalscutoff, gpaXgpagrcutoff,
                                    clustervar, weights))
lowHS6a <- na.omit(lowHS6a)
table6col2 <- lm(gradin4 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = lowHS6a, weights=weights)


# HS grades > median
highHS6a <- subset(highHS, select = c(gradin4, gpalscutoff,
                                      gpaXgpalscutoff, gpaXgpagrcutoff,
                                      clustervar, weights))
highHS6a <- na.omit(highHS6a)
table6col3 <- lm(gradin4 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = highHS6a, weights=weights)


# Male
male6a <- subset(male, select = c(gradin4, gpalscutoff,
                                  gpaXgpalscutoff, gpaXgpagrcutoff,
                                  clustervar, weights))
male6a <- na.omit(male6a)
table6col4 <- lm(gradin4 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = male6a, weights=weights)


# Female
female6a <- subset(female, select = c(gradin4, gpalscutoff,
                                      gpaXgpalscutoff, gpaXgpagrcutoff,
                                      clustervar, weights))
female6a <- na.omit(female6a)
table6col5 <- lm(gradin4 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = female6a, weights=weights)


# English
english6a <- subset(english, select = c(gradin4, gpalscutoff,
                                        gpaXgpalscutoff, gpaXgpagrcutoff,
                                        clustervar, weights))
english6a <- na.omit(english6a)
table6col6 <- lm(gradin4 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = english6a, weights=weights)


# Nonnative English
noenglish6a <- subset(noenglish, select = c(gradin4, gpalscutoff,
                                            gpaXgpalscutoff, gpaXgpagrcutoff,
                                            clustervar, weights))
noenglish6a <- na.omit(noenglish6a)
table6col7 <- lm(gradin4 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                 data = noenglish6a, weights=weights)

# Output displays results in Table 6 Panel A
cl61a <- cl(matched.data6a, table6col1, matched.data6a$clustervar)
cl62a <- cl(lowHS6a, table6col2, lowHS6a$clustervar)
cl63a <- cl(highHS6a, table6col3, highHS6a$clustervar)
cl64a <- cl(male6a, table6col4, male6a$clustervar)
cl65a <- cl(female6a, table6col5, female6a$clustervar)
cl66a <- cl(english6a, table6col6, english6a$clustervar)
cl67a <- cl(noenglish6a, table6col7, noenglish6a$clustervar)


se <- list(cl61a[,2],cl62a[,2],cl63a[,2],cl64a[,2],cl65a[,2],cl66a[,2],cl67a[,2])
p <- list(cl61a[,4],cl62a[,4],cl63a[,4],cl64a[,4],cl65a[,4],cl66a[,4],cl67a[,4])
stargazer(table6col1,table6col2,table6col3,table6col4,table6col5,table6col6,table6col7, se=se, p=p)


###############################################################################
# Table 6 - Estimated Effects on Graduation
# Panel B: Dependent variable: graduated after five years
###############################################################################

# Set up the subsets for each column & set up regression models

# All
matched.data6b <- subset(matched.data, select = c(gradin5, gpalscutoff,
                                                  gpaXgpalscutoff, gpaXgpagrcutoff,
                                                  clustervar, weights))
matched.data6b <- na.omit(matched.data6b)
table6col1b <- lm(gradin5 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = matched.data6b, weights=weights)


# HS grades < median
lowHS6b <- subset(lowHS, select = c(gradin5, gpalscutoff,
                                    gpaXgpalscutoff, gpaXgpagrcutoff,
                                    clustervar, weights))
lowHS6b <- na.omit(lowHS6b)
table6col2b <- lm(gradin5 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = lowHS6b, weights=weights)


# HS grades > median
highHS6b <- subset(highHS, select = c(gradin5, gpalscutoff,
                                      gpaXgpalscutoff, gpaXgpagrcutoff,
                                      clustervar, weights))
highHS6b <- na.omit(highHS6b)
table6col3b <- lm(gradin5 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = highHS6b, weights=weights)


# Male
male6b <- subset(male, select = c(gradin5, gpalscutoff,
                                  gpaXgpalscutoff, gpaXgpagrcutoff,
                                  clustervar, weights))
male6b <- na.omit(male6b)
table6col4b <- lm(gradin5 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = male6b, weights=weights)


# Female
female6b <- subset(female, select = c(gradin5, gpalscutoff,
                                      gpaXgpalscutoff, gpaXgpagrcutoff,
                                      clustervar, weights))
female6b <- na.omit(female6b)
table6col5b <- lm(gradin5 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = female6b, weights=weights)


# English
english6b <- subset(english, select = c(gradin5, gpalscutoff,
                                        gpaXgpalscutoff, gpaXgpagrcutoff,
                                        clustervar, weights))
english6b <- na.omit(english6b)
table6col6b <- lm(gradin5 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = english6b, weights=weights)


# Nonnative English
noenglish6b <- subset(noenglish, select = c(gradin5, gpalscutoff,
                                            gpaXgpalscutoff, gpaXgpagrcutoff,
                                            clustervar, weights))
noenglish6b <- na.omit(noenglish6b)
table6col7b <- lm(gradin5 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = noenglish6b, weights=weights)

# Output displays results in Table 6 Panel B
cl61b <- cl(matched.data6b, table6col1b, matched.data6b$clustervar)
cl62b <- cl(lowHS6b, table6col2b, lowHS6b$clustervar)
cl63b <- cl(highHS6b, table6col3b, highHS6b$clustervar)
cl64b <- cl(male6b, table6col4b, male6b$clustervar)
cl65b <- cl(female6b, table6col5b, female6b$clustervar)
cl66b <- cl(english6b, table6col6b, english6b$clustervar)
cl67b <- cl(noenglish6b, table6col7b, noenglish6b$clustervar)


se <- list(cl61b[,2],cl62b[,2],cl63b[,2],cl64b[,2],cl65b[,2],cl66b[,2],cl67b[,2])
p <- list(cl61b[,4],cl62b[,4],cl63b[,4],cl64b[,4],cl65b[,4],cl66b[,4],cl67b[,4])
stargazer(table6col1b,table6col2b,table6col3b,table6col4b,table6col5b,table6col6b,table6col7b, se=se, p=p)


###############################################################################
# Table 6 - Estimated Effects on Graduation
# Panel C: Dependent variable: graduated after six years
###############################################################################

# Set up the subsets for each column & set up regression models
# All
matched.data6c <- subset(matched.data, select = c(gradin6, gpalscutoff,
                                                  gpaXgpalscutoff, gpaXgpagrcutoff,
                                                  clustervar, weights))
matched.data6c <- na.omit(matched.data6c)
table6col1c <- lm(gradin6 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = matched.data6c, weights=weights)

# HS grades < median
lowHS6c <- subset(lowHS, select = c(gradin6, gpalscutoff,
                                    gpaXgpalscutoff, gpaXgpagrcutoff,
                                    clustervar, weights))
lowHS6c <- na.omit(lowHS6c)
table6col2c <- lm(gradin6 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = lowHS6c, weights=weights)


# HS grades > median
highHS6c <- subset(highHS, select = c(gradin6, gpalscutoff,
                                      gpaXgpalscutoff, gpaXgpagrcutoff,
                                      clustervar, weights))
highHS6c <- na.omit(highHS6c)
table6col3c <- lm(gradin6 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = highHS6c, weights=weights)


# Male
male6c <- subset(male, select = c(gradin6, gpalscutoff,
                                  gpaXgpalscutoff, gpaXgpagrcutoff,
                                  clustervar, weights))
male6c <- na.omit(male6c)
table6col4c <- lm(gradin6 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = male6c, weights=weights)



# Female
female6c <- subset(female, select = c(gradin6, gpalscutoff,
                                      gpaXgpalscutoff, gpaXgpagrcutoff,
                                      clustervar, weights))
female6c <- na.omit(female6c)
table6col5c <- lm(gradin6 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = female6c, weights=weights)



# English
english6c <- subset(english, select = c(gradin6, gpalscutoff,
                                        gpaXgpalscutoff, gpaXgpagrcutoff,
                                        clustervar, weights))
english6c <- na.omit(english6c)
table6col6c <- lm(gradin6 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = english6c, weights=weights)

# Nonnative English
noenglish6c <- subset(noenglish, select = c(gradin6, gpalscutoff,
                                            gpaXgpalscutoff, gpaXgpagrcutoff,
                                            clustervar, weights))
noenglish6c <- na.omit(noenglish6c)
table6col7c <- lm(gradin6 ~ gpalscutoff + gpaXgpalscutoff + gpaXgpagrcutoff,
                  data = noenglish6c, weights=weights)

# Output displays results in Table 6 Panel C
cl61c <- cl(matched.data6c, table6col1c, matched.data6c$clustervar)
cl62c <- cl(lowHS6c, table6col2c, lowHS6c$clustervar)
cl63c <- cl(highHS6c, table6col3c, highHS6c$clustervar)
cl64c <- cl(male6c, table6col4c, male6c$clustervar)
cl65c <- cl(female6c, table6col5c, female6c$clustervar)
cl66c <- cl(english6c, table6col6c, english6c$clustervar)
cl67c <- cl(noenglish6c, table6col7c, noenglish6c$clustervar)


se <- list(cl61c[,2],cl62c[,2],cl63c[,2],cl64c[,2],cl65c[,2],cl66c[,2],cl67c[,2])
p <- list(cl61c[,4],cl62c[,4],cl63c[,4],cl64c[,4],cl65c[,4],cl66c[,4],cl67c[,4])
stargazer(table6col1c,table6col2c,table6col3c,table6col4c,table6col5c,table6col6c,table6col7c, se=se, p=p)

#####################################################
# Simulations
# Left Schools
#####################################################


#############
# ALL
#############

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights")
data.for.sim <- matched.data[myvars]

# Delete rows containing missing data
data.for.sim <- na.omit(data.for.sim)

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$left_school
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
            w=W,
             method = "BFGS", control = list(fnscale = -1),
            hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1 <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0 <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
par(xpd=FALSE)
plot(density(pred.pi0), col="blue",
     main = "All",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1), lty=2, col = "red")
legend(.2,50, 
       c("Good Standing","Under Probation"),
       lty=c(1,2),
       lwd=c(2.5,2.5),col=c("blue","red")) 



#####################
# HS Grades < median
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS")
data.for.sim <- matched.data[myvars]

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$lowHS==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$left_school
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.low <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.low <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.low), col="blue",
     main = "HS grades < median",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.low), lty=2, col = "red")
legend(.2,50, 
       c("Good Standing","Under Probation"),
       lty=c(1,2),
       lwd=c(2.5,2.5),col=c("blue","red")) 


#####################
# HS Grades > median
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS")
data.for.sim <- matched.data[myvars]

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$highHS==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$left_school
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.high <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.high <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.high), col="blue",
     main = "HS grades > median",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.high), lty=2, col = "red")
legend(.2,50, 
       c("Good Standing","Under Probation"),
       lty=c(1,2),
       lwd=c(2.5,2.5),col=c("blue","red")) 



#####################
# Male
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS")
data.for.sim <- matched.data[myvars]

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$male==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$left_school
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.male <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.male <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.male), col="blue",
     main = "Males",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.male), lty=2, col = "red")
legend(.2,50, 
       c("Good Standing","Under Probation"),
       lty=c(1,2),
       lwd=c(2.5,2.5),col=c("blue","red")) 




#####################
# Female
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS")
data.for.sim <- matched.data[myvars]

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$male==0,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$left_school
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.female <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.female <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.female), col="blue",
     main = "Females",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.female), lty=2, col = "red")
legend(.2,50, 
       c("Good Standing","Under Probation"),
       lty=c(1,2),
       lwd=c(2.5,2.5),col=c("blue","red")) 



#####################
# Native English
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS")
data.for.sim <- matched.data[myvars]

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$english==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$left_school
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.english <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.english <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.english), col="blue",
     main = "Native English",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.english), lty=2, col = "red")
legend(.2,50, 
       c("Good Standing","Under Probation"),
       lty=c(1,2),
       lwd=c(2.5,2.5),col=c("blue","red")) 




#####################
# Nonnative English
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS")
data.for.sim <- matched.data[myvars]

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$english==0,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$left_school
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.nonenglish <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.nonenglish <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.nonenglish), col="blue",
     main = "Nonnative English",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.nonenglish), lty=2, col = "red")
legend(.2,50, 
       c("Good Standing","Under Probation"),
       lty=c(1,2),
       lwd=c(2.5,2.5),col=c("blue","red")) 


plot(density(pred.pi0), col="blue",
     main = "All",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1), lty=2, col = "red")
#plot.new()
plot(density(pred.pi0.low), col="blue",
     main = "HS grades < median",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.low), lty=2, col = "red")
plot(density(pred.pi0.high), col="blue",
     main = "HS grades > median",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.high), lty=2, col = "red")
plot(density(pred.pi0.male), col="blue",
     main = "Males",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.male), lty=2, col = "red")
plot(density(pred.pi0.female), col="blue",
     main = "Females",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.female), lty=2, col = "red")
plot(density(pred.pi0.english), col="blue",
     main = "Native English",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.english), lty=2, col = "red")
plot(density(pred.pi0.nonenglish), col="blue",
     main = "Nonnative English",
     xlab = "Proportion leaving school",
     ylim = c(0,60),
     xlim = c(0,.12),
     sub= "") 
lines(density(pred.pi1.nonenglish), lty=2, col = "red")
dev.off()


#####################################################
# Simulations
# Subsequent GPA
#####################################################


#############
# ALL
#############

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights","nextGPA")
data.for.sim <- matched.data[myvars]

# Delete rows containing missing data
data.for.sim <- na.omit(data.for.sim)

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$nextGPA
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1 <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0 <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0), col="blue",
     main = "All",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1), lty=2, col = "red")



#####################
# HS Grades < median
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS","nextGPA")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$lowHS==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$nextGPA
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.low <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.low <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.low), col="blue",
     main = "HS grades < median",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.low), lty=2, col = "red")


#####################
# HS Grades > median
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "nextGPA")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$highHS==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$nextGPA
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.high <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.high <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.high), col="blue",
     main = "HS grades > median",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.high), lty=2, col = "red")




#####################
# Male
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "nextGPA")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$male==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$nextGPA
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.male <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.male <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.male), col="blue",
     main = "Males",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.male), lty=2, col = "red")



#####################
# Female
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "nextGPA")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$male==0,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$nextGPA
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.female <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.female <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.female), col="blue",
     main = "Females",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.female), lty=2, col = "red")



#####################
# Native English
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "nextGPA")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$english==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$nextGPA
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.english <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.english <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.english), col="blue",
     main = "Native English",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.english), lty=2, col = "red")



#####################
# Nonnative English
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "nextGPA")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)


# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$english==0,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$nextGPA
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.nonenglish <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.nonenglish <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.nonenglish), col="blue",
     main = "Nonnative English",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.nonenglish), lty=2, col = "red")



plot(density(pred.pi0), col="blue",
     main = "All",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1), lty=2, col = "red")
#plot.new()
plot(density(pred.pi0.low), col="blue",
     main = "HS grades < median",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.low), lty=2, col = "red")
plot(density(pred.pi0.high), col="blue",
     main = "HS grades > median",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.high), lty=2, col = "red")
plot(density(pred.pi0.male), col="blue",
     main = "Males",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.male), lty=2, col = "red")
plot(density(pred.pi0.female), col="blue",
     main = "Females",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.female), lty=2, col = "red")
plot(density(pred.pi0.english), col="blue",
     main = "Native English",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.english), lty=2, col = "red")
plot(density(pred.pi0.nonenglish), col="blue",
     main = "Nonnative English",
     xlab = "Subsequent GPA minus probation cutoff",
     ylim = c(0,20),
     xlim = c(0,1),
     sub= "") 
lines(density(pred.pi1.nonenglish), lty=2, col = "red")
dev.off()


#####################################################
# Simulations
# Graduate in 6 years
#####################################################


#############
# ALL
#############

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights","gradin6")
data.for.sim <- matched.data[myvars]

# Delete rows containing missing data
data.for.sim <- na.omit(data.for.sim)

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$gradin6
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1 <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0 <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0), col="blue",
     main = "All",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(.5,.8),
     sub= "") 
lines(density(pred.pi1), lty=2, col = "red")


#####################
# HS Grades < median
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS","gradin6")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$lowHS==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$gradin6
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.low <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.low <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.low), col="blue",
     main = "HS grades < median",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(.5,.8),
     sub= "") 
lines(density(pred.pi1.low), lty=2, col = "red")


#####################
# HS Grades > median
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "gradin6")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$highHS==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$gradin6
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.high <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.high <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.high), col="blue",
     main = "HS grades > median",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(.5,.8),
     sub= "") 
lines(density(pred.pi1.high), lty=2, col = "red")




#####################
# Male
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "gradin6")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$male==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$gradin6
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.male <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.male <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.male), col="blue",
     main = "Males",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1.male), lty=2, col = "red")



#####################
# Female
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "gradin6")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$male==0,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$gradin6
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.female <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.female <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.female), col="blue",
     main = "Females",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(.5,.8),
     sub= "") 
lines(density(pred.pi1.female), lty=2, col = "red")



#####################
# Native English
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "gradin6")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)

# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$english==1,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$gradin6
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.english <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.english <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.english), col="blue",
     main = "Native English",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1.english), lty=2, col = "red")



#####################
# Nonnative English
#####################

# Subset the data
myvars <- c("left_school", "gpalscutoff", 
            "gpaXgpalscutoff", "gpaXgpagrcutoff",
            "male", "totcredits_year1",
            "hsgrade_pct", "age_at_entry", "english",
            "loc_campus1", "loc_campus3", "bpl_north_america",
            "weights", "lowHS", "highHS", "gradin6")
data.for.sim <- matched.data[myvars]

data.for.sim <- na.omit(data.for.sim)


# Subset for high HS
data.for.sim <- data.for.sim[data.for.sim$english==0,]

# Log-likelihood with weights
llnormal <- function(param,y,x,w) {
  cov <- x %*% param[1:ncol(x)]
  sigma <- exp(param[ncol(x)+1])
  -.5* sum( log(2*pi) + log((sigma^2)/w) + w*((y-cov)^2)/(sigma^2))
}

y <- data.for.sim$gradin6
X <- data.for.sim[,c("gpalscutoff", 
                     "gpaXgpalscutoff", "gpaXgpagrcutoff")]
Xmat <- as.matrix(cbind(1,X))
W <- data.for.sim$weights

opt <- optim(par = rep(0.01,5), fn = llnormal, y=y, x=Xmat,
             w=W,
             method = "BFGS", control = list(fnscale = -1),
             hessian = TRUE)
opt$par
stderrors <- diag(sqrt(solve(-opt$hessian)))
stderrors

# Simulate betas
varcv <- -solve(opt$hessian)
set.seed(02138)
library(mvtnorm)
sims <- rmvnorm(1000, mean = opt$par, sigma = varcv)

# Under probation
predCovs.treatment <- c(1,1,-0.01,0)

# NOT Under Probation
predCovs.control <- c(1,0,0,0.01)

# Get expected values
pred.pi1.nonenglish <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.treatment%*%x)
pred.pi0.nonenglish <- apply(sims[1:1000,1:4],1,function(x)
  predCovs.control%*%x)

# Plot expected values
plot(density(pred.pi0.nonenglish), col="blue",
     main = "Nonnative English",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1.nonenglish), lty=2, col = "red")



plot(density(pred.pi0), col="blue",
     main = "All",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1), lty=2, col = "red")
#plot.new()
plot(density(pred.pi0.low), col="blue",
     main = "HS grades < median",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1.low), lty=2, col = "red")
plot(density(pred.pi0.high), col="blue",
     main = "HS grades > median",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1.high), lty=2, col = "red")
plot(density(pred.pi0.male), col="blue",
     main = "Males",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1.male), lty=2, col = "red")
plot(density(pred.pi0.female), col="blue",
     main = "Females",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1.female), lty=2, col = "red")
plot(density(pred.pi0.english), col="blue",
     main = "Native English",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1.english), lty=2, col = "red")
plot(density(pred.pi0.nonenglish), col="blue",
     main = "Nonnative English",
     xlab = "Proportion graduating within 6 years",
     ylim = c(0,20),
     xlim = c(0.5,.8),
     sub= "") 
lines(density(pred.pi1.nonenglish), lty=2, col = "red")
dev.off()
